import pandas as pd
# Silence SettingWithCopyWarning for the slice-assignments used throughout.
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud
import plotly.express as px
import string
from nltk.corpus import stopwords
import warnings
# Notebook convenience: hide library warnings in cell output.
warnings.filterwarnings("ignore")
# Show long tweet texts in full when displaying DataFrames.
pd.options.display.max_colwidth = 500
C:\Users\Serhan\anaconda3\lib\site-packages\numpy\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:
C:\Users\Serhan\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\Serhan\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
warnings.warn("loaded more than 1 DLL from .libs:\n%s" %
All COVID-19 Vaccines Tweets
# DATA TAKEN FROM https://www.kaggle.com/gpreda/all-covid19-vaccines-tweets/code
# Load the raw tweet dump. The quotechar/delimiter arguments match the pandas
# defaults; they are kept only because the original cell spelled them out.
df = pd.read_csv("vaccination_all_tweets.csv", delimiter=',', quotechar='"')
df.head()
| id | user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | retweets | favorites | is_retweet | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1340539111971516416 | Rachel Roh | La Crescenta-Montrose, CA | Aggregator of Asian American news; scanning diverse sources 24/7/365. RT's, Follows and 'Likes' will fuel me 👩💻 | 2009-04-08 17:52:46 | 405 | 1692 | 3247 | False | 2020-12-20 06:06:44 | Same folks said daikon paste could treat a cytokine storm #PfizerBioNTech https://t.co/xeHhIMg1kF | ['PfizerBioNTech'] | Twitter for Android | 0 | 0 | False |
| 1 | 1338158543359250433 | Albert Fong | San Francisco, CA | Marketing dude, tech geek, heavy metal & '80s music junkie. Fascinated by meteorology and all things in the cloud. Opinions are my own. | 2009-09-21 15:27:30 | 834 | 666 | 178 | False | 2020-12-13 16:27:13 | While the world has been on the wrong side of history this year, hopefully, the biggest vaccination effort we've ev… https://t.co/dlCHrZjkhm | NaN | Twitter Web App | 1 | 1 | False |
| 2 | 1337858199140118533 | eli🇱🇹🇪🇺👌 | Your Bed | heil, hydra 🖐☺ | 2020-06-25 23:30:28 | 10 | 88 | 155 | False | 2020-12-12 20:33:45 | #coronavirus #SputnikV #AstraZeneca #PfizerBioNTech #Moderna #Covid_19 Russian vaccine is created to last 2-4 years… https://t.co/ieYlCKBr8P | ['coronavirus', 'SputnikV', 'AstraZeneca', 'PfizerBioNTech', 'Moderna', 'Covid_19'] | Twitter for Android | 0 | 0 | False |
| 3 | 1337855739918835717 | Charles Adler | Vancouver, BC - Canada | Hosting "CharlesAdlerTonight" Global News Radio Network. Weeknights 7 Pacific-10 Eastern - Email comments/ideas to charles@charlesadlertonight.ca | 2008-09-10 11:28:53 | 49165 | 3933 | 21853 | True | 2020-12-12 20:23:59 | Facts are immutable, Senator, even when you're not ethically sturdy enough to acknowledge them. (1) You were born i… https://t.co/jqgV18kch4 | NaN | Twitter Web App | 446 | 2129 | False |
| 4 | 1337854064604966912 | Citizen News Channel | NaN | Citizen News Channel bringing you an alternative news source from citizen journalists that haven't sold out. Real news & real views | 2020-04-23 17:58:42 | 152 | 580 | 1473 | False | 2020-12-12 20:17:19 | Explain to me again why we need a vaccine @BorisJohnson @MattHancock #whereareallthesickpeople #PfizerBioNTech… https://t.co/KxbSRoBEHq | ['whereareallthesickpeople', 'PfizerBioNTech'] | Twitter for iPhone | 0 | 0 | False |
# Quick structural overview: column names, then (rows, columns).
for overview in (df.columns, df.shape):
    print(overview)
Index(['id', 'user_name', 'user_location', 'user_description', 'user_created',
'user_followers', 'user_friends', 'user_favourites', 'user_verified',
'date', 'text', 'hashtags', 'source', 'retweets', 'favorites',
'is_retweet'],
dtype='object')
(88978, 16)
# Only the tweet text and its date are needed for the analysis.
subset = df[['date', 'text']].dropna()
# Normalise full timestamps down to calendar dates (datetime.date objects).
subset['date'] = pd.to_datetime(subset['date']).dt.date
df = subset
df.head()
| date | text | |
|---|---|---|
| 0 | 2020-12-20 | Same folks said daikon paste could treat a cytokine storm #PfizerBioNTech https://t.co/xeHhIMg1kF |
| 1 | 2020-12-13 | While the world has been on the wrong side of history this year, hopefully, the biggest vaccination effort we've ev… https://t.co/dlCHrZjkhm |
| 2 | 2020-12-12 | #coronavirus #SputnikV #AstraZeneca #PfizerBioNTech #Moderna #Covid_19 Russian vaccine is created to last 2-4 years… https://t.co/ieYlCKBr8P |
| 3 | 2020-12-12 | Facts are immutable, Senator, even when you're not ethically sturdy enough to acknowledge them. (1) You were born i… https://t.co/jqgV18kch4 |
| 4 | 2020-12-12 | Explain to me again why we need a vaccine @BorisJohnson @MattHancock #whereareallthesickpeople #PfizerBioNTech… https://t.co/KxbSRoBEHq |
# Duplicate texts (retweets/copies) would skew sentiment counts; keep first occurrence.
df = df.drop_duplicates(subset='text')
print(df.shape)
(88860, 2)
# Clean unwanted content from the tweets, step by step.
texts = df.text

# Remove URLs from tweets.
remove_url = lambda x: re.sub(r'http\S+', '', str(x))
texts_lr = texts.apply(remove_url)

# Convert all tweets to lowercase.
to_lower = lambda x: x.lower()
texts_lr_lc = texts_lr.apply(to_lower)

# Remove punctuation in one C-level pass per tweet.
remove_puncs = lambda x: x.translate(str.maketrans('', '', string.punctuation))
texts_lr_lc_np = texts_lr_lc.apply(remove_puncs)

# Remove stopwords.
# NOTE(review): the '#...'-prefixed entries below can never match because '#'
# was already stripped with the punctuation, and 'U' can never match because
# the text is already lowercased; the bare 'coronavirus'/'covid19' entries do
# the actual filtering. Kept as-is to preserve behavior.
more_words = ['say', 'going', 'like', 'U', 'u', '#coronavirus', '#coronavirusoutbreak',
              '#coronavirusPandemic', '#covid19', '#covid_19', 'coronavirus', 'covid19']
stop_words = set(stopwords.words('english'))  # nltk package
stop_words.update(more_words)
remove_words = lambda x: ' '.join([word for word in x.split() if word not in stop_words])
# BUG FIX: the original read `texts_lr_lc_np_ns=r=...`, a chained-assignment
# typo that also bound the result to a stray global named `r`.
texts_lr_lc_np_ns = texts_lr_lc_np.apply(remove_words)
texts_lr_lc_np_ns

# Write the cleaned text back into the DataFrame.
df.text = texts_lr_lc_np_ns
# Removing emojis.
# PERF FIX: compile the pattern once at module scope — the original recompiled
# it on every call (i.e. once per tweet).
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

def remove_emoji(text):
    """Return *text* with characters in the emoji ranges above removed."""
    return _EMOJI_PATTERN.sub(r'', text)
# Strip emojis from every tweet, then preview the cleaned frame.
df['text'] = df['text'].apply(remove_emoji)
display(df.head())
| date | text | |
|---|---|---|
| 0 | 2020-12-20 | folks said daikon paste could treat cytokine storm pfizerbiontech |
| 1 | 2020-12-13 | world wrong side history year hopefully biggest vaccination effort weve ev… |
| 2 | 2020-12-12 | sputnikv astrazeneca pfizerbiontech moderna russian vaccine created last 24 years… |
| 3 | 2020-12-12 | facts immutable senator even youre ethically sturdy enough acknowledge 1 born i… |
| 4 | 2020-12-12 | explain need vaccine borisjohnson matthancock whereareallthesickpeople pfizerbiontech… |
# Sentiment analysis with TextBlob. Polarity ranges from -1 to +1.
def _tweet_polarity(tweet):
    return TextBlob(tweet).sentiment.polarity

df['polarity'] = df['text'].apply(_tweet_polarity)
df
| date | text | polarity | |
|---|---|---|---|
| 0 | 2020-12-20 | folks said daikon paste could treat cytokine storm pfizerbiontech | 0.000 |
| 1 | 2020-12-13 | world wrong side history year hopefully biggest vaccination effort weve ev… | -0.500 |
| 2 | 2020-12-12 | sputnikv astrazeneca pfizerbiontech moderna russian vaccine created last 24 years… | 0.000 |
| 3 | 2020-12-12 | facts immutable senator even youre ethically sturdy enough acknowledge 1 born i… | 0.100 |
| 4 | 2020-12-12 | explain need vaccine borisjohnson matthancock whereareallthesickpeople pfizerbiontech… | 0.000 |
| ... | ... | ... | ... |
| 88973 | 2021-05-24 | india’s panacea biotec started producing sputnikv russian vaccine first batch produced t… | 0.125 |
| 88974 | 2021-05-24 | globeandmail canada stop politicizing vaccine toronto ontario ruined lockdown closed business school st… | -0.100 |
| 88975 | 2021-05-24 | look arvindkejriwal uselesscm resignnow kejriwalfailsdelhi vaccineshortage sputnikv coronavaccine mumbai… | 0.000 |
| 88976 | 2021-05-24 | globalnewsto canada stop politicizing vaccine toronto ontario ruined lockdown closed business school st… | -0.100 |
| 88977 | 2021-05-24 | rdif panacea biotec start sputnik v production india sputnikv | 0.000 |
88860 rows × 3 columns
# Some of the most positive tweets:
(df[['text', 'polarity']]
   .sort_values(by='polarity', ascending=False)
   .reset_index(drop=True)
   .head(n=10))
| text | polarity | |
|---|---|---|
| 0 | himantabiswa best part honble narendramodi ji taken jab indias indigenous vaccine covaxin by… | 1.0 |
| 1 | pfizer jab morning efficient wellorganised thank wonderful doctor gav… | 1.0 |
| 2 | pfizer ocugen best vaccine covaxin bharatbiotech | 1.0 |
| 3 | novavax uniofoxford ocugen best vaccine covaxin bharatbiotech | 1.0 |
| 4 | sputnikvaccine americans come russia get sputnikv it’s best vaccine | 1.0 |
| 5 | titanic sinking thank best cm amp vasooli gang lockdownmaharashtra coronavirus… | 1.0 |
| 6 | vaccine best world trust vaccine jai hindcovaxin | 1.0 |
| 7 | lokeshwarri bldatastories annapurani93 businessline srirags best vaccine indian sputnikv … | 1.0 |
| 8 | krysboydthink awesome team moderna | 1.0 |
| 9 | wonderful husband educator became eligible get vaccine asked motivation and… | 1.0 |
# Some of the most negative tweets:
(df[['text', 'polarity']]
   .sort_values(by='polarity', ascending=True)
   .reset_index(drop=True)
   .head(n=10))
| text | polarity | |
|---|---|---|
| 0 | vaccinediplomacy gets underway prospects entirely grim sciencediplomacy sputnikv… | -1.0 |
| 1 | vonderleyen evil face would resemble vaccine virus eu europeanunion… | -1.0 |
| 2 | awful may rest peace may family get justice deserve vaccine jandj… | -1.0 |
| 3 | abc7newsbayarea course send faulty vaccines neighborswhy jnj moderna pfizervaccine pathetic po 🤣 | -1.0 |
| 4 | moderna side effects terrible head aches sore arminjection site head aches may bc i… | -1.0 |
| 5 | indias terrifying indian death toll hits worlds highest… | -1.0 |
| 6 | day 3 worst headache life moderna covidvaccine | -1.0 |
| 7 | come eu countries get together save lifes dreadful stop pop at… | -1.0 |
| 8 | mjlleghari pathetic managed online appointments chughtai muc… | -1.0 |
| 9 | got secondvaccination yesterday amp feel awful vaccinated covidvaccine moderna | -1.0 |
# Distribution of tweets by polarity
fig = plt.figure(figsize=(10, 6))
ax = df['polarity'].hist()
ax.set_xlabel('Polarity Score', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
(array([ 0., 10000., 20000., 30000., 40000., 50000., 60000., 70000.]), [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])
# Bucket polarity scores into three sentiment classes for easier processing.
# NOTE: .between() bounds are inclusive, so the boundary values -0.01 and 0.01
# fall into two conditions; np.select resolves them to the first match.
criteria = [df['polarity'].between(-1, -0.01), df['polarity'].between(-0.01, 0.01), df['polarity'].between(0.01, 1)]
values = ['negative', 'neutral', 'positive']
df['sentiment'] = np.select(criteria, values, 0)
fig = plt.figure(figsize=(10, 6))
df['sentiment'].value_counts().sort_index().plot.bar()
plt.xlabel('Sentiment Label', fontsize=18)
plt.ylabel('Tweet Count', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# BUG FIX: tight_layout() must run before show(); the original called it after,
# which laid out a brand-new empty figure (see the "<Figure ... with 0 Axes>"
# artifact in the original output).
plt.tight_layout()
plt.show()
<Figure size 432x288 with 0 Axes>
import math
import random
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
from wordcloud import WordCloud, ImageColorGenerator
import wordninja

# English stopwords plus "amp" (HTML-escaped '&' left over in tweet text).
stop_words = set(stopwords.words('english'))
stop_words.add("amp")
def flatten_list(l):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flat = []
    for sub in l:
        flat.extend(sub)
    return flat
def is_acceptable(word: str):
    """Keep a word only if it is longer than 2 characters and not a stopword."""
    if word in stop_words:
        return False
    return len(word) > 2
# Color coding for the word clouds: random red, green, and yellow HSL shades.
def red_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    lightness = random.randint(25, 75)
    return "hsl(0, 100%, {}%)".format(lightness)

def green_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    hue = random.randint(90, 150)
    return "hsl({}, 100%, 30%)".format(hue)

def yellow_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    lightness = random.randint(25, 50)
    return "hsl(42, 100%, {}%)".format(lightness)
# Reusable function to generate word clouds
def generate_word_clouds(neg_doc, neu_doc, pos_doc):
    """Render three side-by-side word clouds (negative / neutral / positive)
    and return the matplotlib figure."""
    fig, axes = plt.subplots(1, 3, figsize=(20, 10))
    panels = [
        (neg_doc, red_color_func, "Negative Words"),
        (neu_doc, yellow_color_func, "Neutral Words"),
        (pos_doc, green_color_func, "Positive Words"),
    ]
    for ax, (doc, color_func, title) in zip(axes, panels):
        cloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(" ".join(doc))
        ax.imshow(cloud.recolor(color_func=color_func, random_state=3), interpolation='bilinear')
        ax.set_title(title)
        ax.axis("off")
    plt.tight_layout()
    # plt.show();
    return fig
def get_top_percent_words(doc, percent):
    """Return the most frequent words in *doc* — the top ``percent`` fraction
    of the number of unique words."""
    top_n = int(percent * len(set(doc)))
    return [word for word, _ in Counter(doc).most_common(top_n)]
def clean_document(doc):
    """Lemmatize *doc* (a list of words), keep correctly spelled or frequent
    words, and try to salvage misspelled concatenations by splitting them.

    Returns the cleaned list of words.
    """
    spell = SpellChecker()
    lemmatizer = WordNetLemmatizer()
    # Lemmatize words (needed for calculating frequencies correctly)
    doc = [lemmatizer.lemmatize(x) for x in doc]
    # Top 10% of all words; may include "misspelled" words we still keep.
    # PERF FIX: stored as a set — the original tested `x in top_n_words`
    # against a list for every word in doc, i.e. O(n*m).
    top_n_words = set(get_top_percent_words(doc, 0.1))
    # Words the spell checker does not recognise.
    misspelled = spell.unknown(doc)
    # Single pass (the original ran two complementary comprehensions):
    # accept correctly spelled or top-n words, queue the rest for splitting.
    clean_words = []
    words_to_split = []
    for word in doc:
        if word not in misspelled or word in top_n_words:
            clean_words.append(word)
        else:
            words_to_split.append(word)
    # Try to split the misspelled words to generate good words
    # (ex. "lifeisstrange" -> ["life", "is", "strange"]).
    split_words = flatten_list([wordninja.split(x) for x in words_to_split])
    # Some splits may be nonsensical, so reject them ("llouis" -> ['ll', 'ou', "is"])
    clean_words.extend(spell.known(split_words))
    return clean_words
def get_log_likelihood(doc1, doc2):
    """Score the words shared by both documents with the log ratio of their
    relative frequencies in doc1 vs doc2, and return the top 10% as
    (word, score) pairs sorted by score descending."""
    def relative_freqs(doc):
        total = len(doc)
        return {word: count / total for word, count in Counter(doc).items()}

    doc1_freq = relative_freqs(doc1)
    doc2_freq = relative_freqs(doc2)
    # 1 is added to prevent division by 0
    doc_ratios = {
        word: math.log((doc1_freq[word] + 1) / (doc2_freq[word] + 1))
        for word in doc1_freq if word in doc2_freq
    }
    top_ratios = Counter(doc_ratios).most_common()
    cutoff = int(0.1 * len(top_ratios))
    return top_ratios[:cutoff]
# Function to generate a document based on likelihood values for words
def get_scaled_list(log_list):
    """Expand (word, score) pairs into a synthetic corpus in which each word
    appears int(score * 100000) times, then shuffle it."""
    cloud = [
        word
        for word, score in log_list
        for _ in range(int(score * 100000))
    ]
    # Shuffle to make it more "real"
    random.shuffle(cloud)
    return cloud
Word clouds of the most distinctive words for each sentiment class (weighted by log-likelihood ratio)
def get_smart_clouds(df):
    """Build per-sentiment word clouds from the tweets' word lists, weighting
    words by how distinctive they are for each sentiment class. Returns the
    matplotlib figure."""
    # Collect and clean one document per sentiment class.
    docs = {}
    for sentiment in ('negative', 'neutral', 'positive'):
        words = flatten_list(df[df['sentiment'] == sentiment]['words'])
        docs[sentiment] = clean_document([w for w in words if is_acceptable(w)])
    # Combine classes B and C to compare against A (ex. "positive" vs "non-positive")
    top_neg_words = get_log_likelihood(docs['negative'], docs['positive'] + docs['neutral'])
    top_neu_words = get_log_likelihood(docs['neutral'], docs['positive'] + docs['negative'])
    top_pos_words = get_log_likelihood(docs['positive'], docs['neutral'] + docs['negative'])
    # Generate a synthetic corpus from the log-likelihood values and visualise it.
    return generate_word_clouds(
        get_scaled_list(top_neg_words),
        get_scaled_list(top_neu_words),
        get_scaled_list(top_pos_words),
    )
# Convert each tweet string to a list of words for the word-cloud pipeline.
# BUG FIX: the original did `wordcloud_df = df`, which is an alias (not a
# copy), so the helper 'words' column was silently added to `df` itself.
wordcloud_df = df.copy()
wordcloud_df['words'] = wordcloud_df['text'].apply(lambda x: re.findall(r'\w+', x))
get_smart_clouds(wordcloud_df).savefig("sentiment_wordclouds.png", bbox_inches="tight")
# How many tweets per day, shown as a bar chart over time.
timeline = df.groupby('date')['text'].count().reset_index(name='count')
fig = px.bar(timeline, x='date', y='count', labels={'date': 'Date', 'count': 'Tweet Count'})
fig.show()
# I'm scaling it to better see how positive/negative tweets change over time
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Mean polarity per day, separately for positive and for negative tweets.
df_positive=df[df['sentiment']=='positive'][['date','polarity']].groupby(['date']).mean().reset_index()
df_negative=df[df['sentiment']=='negative'][['date','polarity']].groupby(['date']).mean().reset_index()
# Min-max scale each daily mean to [0, 1] so the two trends are comparable.
# NOTE: `scaler` is re-fit on each column (and re-fit again in a later cell),
# so the two scales are independent of each other.
df_positive['Positive_scale'] = scaler.fit_transform(df_positive[["polarity"]])
df_negative['Negative_scale'] = scaler.fit_transform(df_negative[["polarity"]])
# Keep only the scaled columns; df_positive/df_negative are merged downstream.
df_positive.drop(columns=['polarity'], inplace=True)
df_negative.drop(columns=['polarity'], inplace=True)
fig = px.bar(df_positive, x='date', y='Positive_scale',title="Positivity Rates of Tweets")
fig.show()
fig = px.bar(df_negative, x='date', y='Negative_scale',title="Negativity Rates of Tweets")
fig.show()
COVID-19 World Vaccination Progress
# DATA TAKEN FROM https://www.kaggle.com/gpreda/covid-world-vaccination-progress
# Load the world vaccination-progress dataset (per-country rows by date).
df_vaccine = pd.read_csv("country_vaccinations.csv")
df_vaccine.head()
| country | iso_code | date | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | vaccines | source_name | source_website | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 2021-02-22 | 0.0 | 0.0 | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing | World Health Organization | https://covid19.who.int/ |
| 1 | Afghanistan | AFG | 2021-02-23 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing | World Health Organization | https://covid19.who.int/ |
| 2 | Afghanistan | AFG | 2021-02-24 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing | World Health Organization | https://covid19.who.int/ |
| 3 | Afghanistan | AFG | 2021-02-25 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing | World Health Organization | https://covid19.who.int/ |
| 4 | Afghanistan | AFG | 2021-02-26 | NaN | NaN | NaN | NaN | 1367.0 | NaN | NaN | NaN | 35.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing | World Health Organization | https://covid19.who.int/ |
# Restrict to the columns used downstream.
keep_cols = ['country', 'date', 'total_vaccinations', 'daily_vaccinations', 'vaccines']
df_vaccine = df_vaccine[keep_cols]
df_vaccine.head()
| country | date | total_vaccinations | daily_vaccinations | vaccines | |
|---|---|---|---|---|---|
| 0 | Afghanistan | 2021-02-22 | 0.0 | NaN | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing |
| 1 | Afghanistan | 2021-02-23 | NaN | 1367.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing |
| 2 | Afghanistan | 2021-02-24 | NaN | 1367.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing |
| 3 | Afghanistan | 2021-02-25 | NaN | 1367.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing |
| 4 | Afghanistan | 2021-02-26 | NaN | 1367.0 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing |
Visualization of daily vaccination data over time
# Worldwide daily vaccinations (summed over all countries) per date.
timeline2 = df_vaccine.groupby('date')['daily_vaccinations'].sum()
fig = px.line(timeline2, labels={'date': 'Date', 'value': 'Daily Vaccinations'})
fig.show()
Visualization of total vaccination data over time
# Worldwide total vaccinations (summed over all countries) per date.
timeline3 = df_vaccine.groupby('date')['total_vaccinations'].sum()
fig = px.line(timeline3, labels={'date': 'Date', 'value': 'Total Vaccinations'})
fig.show()
# Merge the tweet-polarity and vaccination series on date, then scale everything.
df_daily = timeline2.reset_index()
df_total = timeline3.reset_index()
# Align date types with the tweet frames (plain datetime.date) before merging.
df_daily['date'] = pd.to_datetime(df_daily['date']).dt.date
df_total['date'] = pd.to_datetime(df_total['date']).dt.date
# Inner-join everything on the shared 'date' column.
df_all = df_daily.merge(df_total).merge(df_positive).merge(df_negative)
# Min-max scale the vaccination counts to [0, 1] like the sentiment columns.
df_all['total_vaccinations'] = scaler.fit_transform(df_all[["total_vaccinations"]])
df_all['daily_vaccinations'] = scaler.fit_transform(df_all[["daily_vaccinations"]])
df_all
| date | daily_vaccinations | total_vaccinations | Positive_scale | Negative_scale | |
|---|---|---|---|---|---|
| 0 | 2020-12-12 | 0.000027 | 0.000000e+00 | 0.237758 | 0.482988 |
| 1 | 2020-12-13 | 0.000000 | 5.857430e-07 | 0.487063 | 0.664559 |
| 2 | 2020-12-14 | 0.000022 | 3.593827e-06 | 0.168529 | 0.556242 |
| 3 | 2020-12-15 | 0.000068 | 8.166088e-04 | 0.529504 | 0.752410 |
| 4 | 2020-12-16 | 0.006044 | 1.283449e-05 | 0.466506 | 0.524750 |
| ... | ... | ... | ... | ... | ... |
| 146 | 2021-05-27 | 0.976800 | 9.309871e-01 | 0.261149 | 0.730174 |
| 147 | 2021-05-28 | 0.975709 | 9.239965e-01 | 0.336346 | 0.724221 |
| 148 | 2021-05-29 | 0.987500 | 9.318942e-01 | 0.526006 | 0.725693 |
| 149 | 2021-05-30 | 1.000000 | 1.000000e+00 | 0.339861 | 0.711905 |
| 150 | 2021-05-31 | 0.860602 | 6.990501e-01 | 0.428659 | 0.744424 |
151 rows × 5 columns
from rdd import rdd
# Regression analysis of Total vaccinations and Positive tweets
# (fits Positive_scale ~ TREATED + total_vaccinations; see printed equation).
model = rdd.rdd(df_all, 'total_vaccinations', 'Positive_scale')
print(model.fit().summary())
Estimation Equation: Positive_scale ~ TREATED + total_vaccinations
WLS Regression Results
==============================================================================
Dep. Variable: Positive_scale R-squared: 0.011
Model: WLS Adj. R-squared: 0.004
Method: Least Squares F-statistic: 1.662
Date: Mon, 07 Jun 2021 Prob (F-statistic): 0.199
Time: 23:10:09 Log-Likelihood: 33.099
No. Observations: 151 AIC: -62.20
Df Residuals: 149 BIC: -56.16
Df Model: 1
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 0.2352 0.010 22.734 0.000 0.215 0.256
TREATED 0.2352 0.010 22.734 0.000 0.215 0.256
total_vaccinations -0.0755 0.059 -1.289 0.199 -0.191 0.040
==============================================================================
Omnibus: 4.937 Durbin-Watson: 1.925
Prob(Omnibus): 0.085 Jarque-Bera (JB): 4.702
Skew: 0.431 Prob(JB): 0.0953
Kurtosis: 3.070 Cond. No. 1.78e+15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9.78e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
# Regression analysis of Total vaccinations and Negative tweets
# (fits Negative_scale ~ TREATED + total_vaccinations; see printed equation).
model = rdd.rdd(df_all, 'total_vaccinations', 'Negative_scale')
print(model.fit().summary())
Estimation Equation: Negative_scale ~ TREATED + total_vaccinations
WLS Regression Results
==============================================================================
Dep. Variable: Negative_scale R-squared: 0.037
Model: WLS Adj. R-squared: 0.031
Method: Least Squares F-statistic: 5.799
Date: Mon, 07 Jun 2021 Prob (F-statistic): 0.0173
Time: 23:10:09 Log-Likelihood: 73.923
No. Observations: 151 AIC: -143.8
Df Residuals: 149 BIC: -137.8
Df Model: 1
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 0.3068 0.008 38.858 0.000 0.291 0.322
TREATED 0.3068 0.008 38.858 0.000 0.291 0.322
total_vaccinations 0.1076 0.045 2.408 0.017 0.019 0.196
==============================================================================
Omnibus: 31.704 Durbin-Watson: 1.513
Prob(Omnibus): 0.000 Jarque-Bera (JB): 74.754
Skew: -0.866 Prob(JB): 5.85e-17
Kurtosis: 5.981 Cond. No. 1.78e+15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9.78e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
# Regression analysis of Daily vaccinations and Positive tweets
# (fits Positive_scale ~ TREATED + daily_vaccinations; see printed equation).
model = rdd.rdd(df_all, 'daily_vaccinations', 'Positive_scale')
print(model.fit().summary())
Estimation Equation: Positive_scale ~ TREATED + daily_vaccinations
WLS Regression Results
==============================================================================
Dep. Variable: Positive_scale R-squared: 0.014
Model: WLS Adj. R-squared: 0.007
Method: Least Squares F-statistic: 2.105
Date: Mon, 07 Jun 2021 Prob (F-statistic): 0.149
Time: 23:10:10 Log-Likelihood: 33.321
No. Observations: 151 AIC: -62.64
Df Residuals: 149 BIC: -56.61
Df Model: 1
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 0.2397 0.012 20.001 0.000 0.216 0.263
TREATED 0.2397 0.012 20.001 0.000 0.216 0.263
daily_vaccinations -0.0838 0.058 -1.451 0.149 -0.198 0.030
==============================================================================
Omnibus: 4.304 Durbin-Watson: 1.930
Prob(Omnibus): 0.116 Jarque-Bera (JB): 4.052
Skew: 0.400 Prob(JB): 0.132
Kurtosis: 3.060 Cond. No. 1.77e+15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.01e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
# Regression analysis of Daily vaccinations and Negative tweets
# (fits Negative_scale ~ TREATED + daily_vaccinations; see printed equation).
model = rdd.rdd(df_all, 'daily_vaccinations', 'Negative_scale')
print(model.fit().summary())
Estimation Equation: Negative_scale ~ TREATED + daily_vaccinations
WLS Regression Results
==============================================================================
Dep. Variable: Negative_scale R-squared: 0.041
Model: WLS Adj. R-squared: 0.035
Method: Least Squares F-statistic: 6.380
Date: Mon, 07 Jun 2021 Prob (F-statistic): 0.0126
Time: 23:10:10 Log-Likelihood: 74.206
No. Observations: 151 AIC: -144.4
Df Residuals: 149 BIC: -138.4
Df Model: 1
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 0.3017 0.009 32.998 0.000 0.284 0.320
TREATED 0.3017 0.009 32.998 0.000 0.284 0.320
daily_vaccinations 0.1113 0.044 2.526 0.013 0.024 0.198
==============================================================================
Omnibus: 31.247 Durbin-Watson: 1.520
Prob(Omnibus): 0.000 Jarque-Bera (JB): 74.820
Skew: -0.846 Prob(JB): 5.66e-17
Kurtosis: 6.005 Cond. No. 1.77e+15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.01e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.